{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this code, we merge the job postings data. We keep the job key to merge with the duration data (note pre_class_postjune.py and others keep the job key for June 15th, 2021 on; for previous dates, the pre_class_temp retrieves it)\n",
    "Note: all files named \"class_key\" were created by Ale by merging the job-key to the pre-June 2021 data--she hadn't kept it originally"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "#import plotly.express as px\n",
    "import os,sys\n",
    "import os.path\n",
    "from datetime import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Paths\n",
    "path_data = '/home/res/curr/pri/indeed_noc_data/canada_class' "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to import the classified data\n",
    "def import_data(csv_name):\n",
    "    df = pd.read_csv(path_data + f'{csv_name}.csv').rename(columns = {u'job_sector':u'company_name'})\n",
    "    df['date_first_visible'] = pd.to_datetime(df['date_first_visible'])\n",
    "    try:\n",
    "        df.drop(df.filter(regex='Unnamed').columns, axis=1, inplace=True)\n",
    "    except:\n",
    "        pass\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "non Quebec until last date:\n",
      "Index(['job_key', 'job_title', 'date_first_visible', 'company_name', 'jlcity',\n",
      "       'jladmin1code', 'no_company', 'job_description', 'SOC_code'],\n",
      "      dtype='object')\n",
      "(1960452, 9)\n",
      "2023-01-01 00:00:00\n",
      "2023-08-25 00:00:00\n",
      "False    0.99242\n",
      "True     0.00758\n",
      "Name: company_name, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# Importing and appending Quebec and non-Quebec data\n",
    "df = import_data('non_quebec_classified')\n",
    "df_qc = import_data('quebec_classified')\n",
    "df = pd.concat([df,df_qc]) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['job_title', 'date_first_visible', 'job_sector', 'jlcity',\n",
      "       'jladmin1code', 'job_description', 'SOC_code', 'job_key', 'job_title.1',\n",
      "       'date_first_visible.1', 'company_name', 'jlcity.1', 'jladmin1code.1',\n",
      "       'no_company'],\n",
      "      dtype='object')\n",
      "(17960989, 14)\n",
      "2018-01-01 00:00:00\n",
      "2023-08-25 00:00:00\n",
      "False    0.994858\n",
      "True     0.005142\n",
      "Name: company_name, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# drop duplicates\n",
    "df.drop_duplicates(subset=['job_key','date_first_visible','job_title'],inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### SAVE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "False    0.999926\n",
      "True     0.000074\n",
      "Name: job_title, dtype: float64\n",
      "True     0.552191\n",
      "False    0.447809\n",
      "Name: job_title.1, dtype: float64\n",
      "False    1.0\n",
      "Name: date_first_visible, dtype: float64\n",
      "True     0.552188\n",
      "False    0.447812\n",
      "Name: date_first_visible.1, dtype: float64\n",
      "False    0.991085\n",
      "True     0.008915\n",
      "Name: jlcity, dtype: float64\n",
      "True     0.55723\n",
      "False    0.44277\n",
      "Name: jlcity.1, dtype: float64\n",
      "False    1.0\n",
      "Name: jladmin1code, dtype: float64\n",
      "True     0.552188\n",
      "False    0.447812\n",
      "Name: jladmin1code.1, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# Check duplicate columns\n",
    "print(df['job_title'].isna().value_counts(normalize=True))\n",
    "print(df['job_title.1'].isna().value_counts(normalize=True))\n",
    "print(df['date_first_visible'].isna().value_counts(normalize=True))\n",
    "print(df['date_first_visible.1'].isna().value_counts(normalize=True))\n",
    "print(df['jlcity'].isna().value_counts(normalize=True))\n",
    "print(df['jlcity.1'].isna().value_counts(normalize=True))\n",
    "print(df['jladmin1code'].isna().value_counts(normalize=True))\n",
    "print(df['jladmin1code.1'].isna().value_counts(normalize=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['job_title', 'date_first_visible', 'jlcity', 'jladmin1code', 'NOC',\n",
      "       'job_key', 'company_name', 'jobcountry'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "# Clean columns\n",
    "df.drop(columns=['job_sector','job_description','job_title.1','date_first_visible.1','jlcity.1','jladmin1code.1','no_company'], inplace=True)\n",
    "df.rename(columns={'SOC_code':'NOC'},inplace=True)\n",
    "print(df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save\n",
    "df.to_csv(path_data+'jobpostingsclass.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.5 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "dcfde247205fe7f7734093e765e0e954956c446af54145a2a31cc272e0be5723"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
